{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import multiprocessing as mp\n",
    "import time\n",
    "import json\n",
    "\n",
    "# My implementation of a Hidden Markov Model\n",
    "from HiddenMarkovModel import HiddenMarkovModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading: /home/tabz/Documents/CMU_Dataset/r4.1/r41_features_simple.h5\n",
      "There are 1000 users\n",
      "Using 7 features\n",
      "Queueing up jobs\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1000/1000 [22:34<00:00,  1.52s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress on those jobs:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 1000/1000 [1:19:10<00:00,  4.55s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving: userScores_simple_r41_80_inertia.json\n",
      "Loading: /home/tabz/Documents/CMU_Dataset/r4.1/r41_features_complex.h5\n",
      "There are 1000 users\n",
      "Using 16 features\n",
      "Queueing up jobs\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1000/1000 [22:36<00:00,  1.25s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress on those jobs:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 1000/1000 [1:29:02<00:00,  5.34s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving: userScores_complex_r41_80_inertia.json\n",
      "Loading: /home/tabz/Documents/CMU_Dataset/r4.2/r42_features_simple.h5\n",
      "There are 1000 users\n",
      "Using 7 features\n",
      "Queueing up jobs\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1000/1000 [21:44<00:00,  1.26s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress on those jobs:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 1000/1000 [1:15:43<00:00,  3.55s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving: userScores_simple_r42_80_inertia.json\n",
      "Loading: /home/tabz/Documents/CMU_Dataset/r4.2/r42_features_complex.h5\n",
      "There are 1000 users\n",
      "Using 16 features\n",
      "Queueing up jobs\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 1000/1000 [21:42<00:00,  1.33s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Progress on those jobs:\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n",
      "100%|██████████| 1000/1000 [1:27:29<00:00,  3.66s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saving: userScores_complex_r42_80_inertia.json\n"
     ]
    }
   ],
   "source": [
    "# Each HDF5 feature table (path relative to the dataset root) is paired with\n",
    "# the JSON file that receives the per-user scores computed from it.\n",
    "names = [ \"r4.1/r41_features_simple.h5\", \"r4.1/r41_features_complex.h5\", \"r4.2/r42_features_simple.h5\", \"r4.2/r42_features_complex.h5\" ]\n",
    "saves = [ \"userScores_simple_r41_80_inertia.json\", \"userScores_complex_r41_80_inertia.json\", \"userScores_simple_r42_80_inertia.json\", \"userScores_complex_r42_80_inertia.json\"]\n",
    "assert(len(names) == len(saves))\n",
    "\n",
    "# NOTE(review): the saved notebook had a stray minus sign inside the loop below,\n",
    "# which made the whole cell a SyntaxError. It sat where `num_states` and\n",
    "# `init_matrices()` (both used by compute_probs) were presumably defined.\n",
    "# Their definitions are not recoverable from this file, so fail fast with a\n",
    "# clear message before the slow load and the long-running pool work start.\n",
    "for required in (\"num_states\", \"init_matrices\"):\n",
    "    if required not in globals():\n",
    "        raise NameError(required + \" is not defined; restore its definition before running this cell\")\n",
    "\n",
    "for name,save in zip(names,saves):\n",
    "    \n",
    "    filename = \"/home/tabz/Documents/CMU_Dataset/\" + name\n",
    "    \n",
    "    print(\"Loading:\", filename)\n",
    "    joint = pd.read_hdf(filename, \"table\")\n",
    "    # The frame is indexed by user, so the distinct index values are the users.\n",
    "    users = np.unique(joint.index.values)\n",
    "    print(\"There are\", users.size, \"users\")\n",
    "    # Number of distinct values observed in the \"feature\" column.\n",
    "    num_features = np.unique(joint[\"feature\"].values).size\n",
    "    print(\"Using\", num_features, \"features\")\n",
    "    \n",
    "    def compute_probs(user_df):\n",
    "        \"\"\"Score one user's weekly activity with an incrementally trained HMM.\n",
    "\n",
    "        `user_df` holds the rows of a single user and must provide a\n",
    "        datetime-like \"date\" column and a \"feature\" column. Rows are bucketed\n",
    "        into weeks and visited in order. Every non-empty week is used to train\n",
    "        the model, after first being scored once the warm-up is over.\n",
    "\n",
    "        Returns a list with one entry per week that produced a score:\n",
    "          * 0 for every week without any rows (warm-up weeks included);\n",
    "          * the negated result of `model.sequence_log_probability` for a\n",
    "            non-empty week seen after the model has been trained more than\n",
    "            `trainingPeriod` times. Earlier non-empty weeks add no entry.\n",
    "        \"\"\"\n",
    "        weekGrouping = pd.Grouper(key=\"date\", freq=\"1W\")\n",
    "        timeGrouping = user_df.groupby(weekGrouping)\n",
    "\n",
    "        # Fresh model per user; init_matrices and num_states are looked up in\n",
    "        # the notebook namespace.\n",
    "        s,t,e = init_matrices()\n",
    "        model = HiddenMarkovModel.HMM(num_states, t,e,s)\n",
    "\n",
    "        trainingPeriod = 4\n",
    "        timesTrained = 0\n",
    "\n",
    "        logProbScores = []\n",
    "\n",
    "        # The group key is unused; it is named so that it no longer shadows\n",
    "        # the `name` variable of the surrounding per-file loop.\n",
    "        for _week, group in timeGrouping:\n",
    "\n",
    "            # The sequence for the time grouping we are considering\n",
    "            seq = group[\"feature\"].values\n",
    "\n",
    "            if len(seq) < 1:\n",
    "                # If there is no activity for this week\n",
    "                logProbScores.append(0)\n",
    "                continue\n",
    "\n",
    "            # Only score once the warm-up is over (strictly more than\n",
    "            # trainingPeriod completed training rounds).\n",
    "            if timesTrained > trainingPeriod:\n",
    "                logProb = model.sequence_log_probability(seq)\n",
    "                logProbScores.append(-logProb)\n",
    "\n",
    "            # Train the model on the sequence we have just seen, scored or not\n",
    "            model.learn(seq, max_iters=20, threshold=0.01, restart_threshold=0.1,max_restarts=5, inertia=0.8)\n",
    "            timesTrained+=1\n",
    "\n",
    "        return logProbScores\n",
    "\n",
    "    userScores = {}\n",
    "\n",
    "    def setInDict(u):\n",
    "        def z(r):\n",
    "            userScores[u] = r\n",
    "        return z\n",
    "\n",
    "    # Fan the per-user work out to 8 worker processes. The AsyncResult handles\n",
    "    # are kept so that a job which raises is seen through .get(); relying only\n",
    "    # on a success callback would leave the progress loop waiting forever for\n",
    "    # a result that never arrives.\n",
    "    pool = mp.Pool(processes=8)\n",
    "    try:\n",
    "        print(\"Queueing up jobs\", flush=True)\n",
    "        pending = []\n",
    "        for user in tqdm(users):\n",
    "            job = pool.apply_async(compute_probs, args=(joint.loc[ joint.index == user ],))\n",
    "            pending.append((user, job))\n",
    "        # No more work will be submitted; workers exit once the queue drains.\n",
    "        pool.close()\n",
    "\n",
    "        print(\"Progress on those jobs:\", flush=True)\n",
    "        failedUsers = {}\n",
    "        for user, job in tqdm(pending):\n",
    "            try:\n",
    "                userScores[user] = job.get()\n",
    "            except Exception as err:\n",
    "                # Keep the other users' results; failures are reported below.\n",
    "                failedUsers[user] = repr(err)\n",
    "        pool.join()\n",
    "    except BaseException:\n",
    "        # Interrupted or failed part-way: do not leave worker processes behind.\n",
    "        pool.terminate()\n",
    "        raise\n",
    "\n",
    "    if failedUsers:\n",
    "        print(\"Failed users:\", len(failedUsers), \"of\", len(pending), flush=True)\n",
    "        for user, reason in failedUsers.items():\n",
    "            print(\"   \", user, reason, flush=True)\n",
    "    if pending and len(failedUsers) == len(pending):\n",
    "        # Nothing succeeded: refuse to overwrite an earlier result file with {}.\n",
    "        raise RuntimeError(\"every job failed; not writing \" + save)\n",
    "\n",
    "    print(\"Saving:\", save)\n",
    "    # The with-block closes the file even if serialisation fails.\n",
    "    with open(save, \"w\") as out_file:\n",
    "        json.dump(userScores, out_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
